#==============================================================================#
# 1-extract-profiles-bd.R
# Matt Curtis 27/06/22
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
# Converts the raw geni_unions.csv file into a chunked Parquet dataset (arrow).
#==============================================================================#
library(arrow)
library(tidyverse)
library(data.table)
library(tidytable)
library(tictoc)

#------------------------------------------------------------------------------#
# Remove every non-hidden object from the current workspace and trigger a
# garbage collection, so the conversion starts from a clean session. Packages
# attached by the library() calls above stay attached.
rm(list = ls())
gc()

# Set the working directory once: all paths below are relative to it.
# Edit this line if the replication package lives somewhere else.

setwd("~/Replication package/")
# Input: the unions CSV that is read further down.
csv_file <- "./1_raw_data/1_2_geni/geni_unions.csv"
# Output: directory that receives the Parquet files written at the end of the
# script.
dest <- "./1_raw_data/1_2_geni/1_2_geni_chunked/geni_unions" 


# Explicit column layout for the unions CSV. Every field is declared as a
# string, so the CSV reader does not infer column types on its own.
sch <- schema(
  union_id      = string(),
  profile_id    = string(),
  union_type    = string(),
  link_type     = string(),
  position      = string(),
  link_modifier = string()
)

# Open the CSV as an Arrow dataset instead of loading it with a regular
# in-memory reader. Because an explicit schema is supplied, the file's header
# line has to be skipped (skip = 1); otherwise it would be read as a data row.
csv_stream <- open_csv_dataset(csv_file, schema = sch, skip = 1)

# Write the data to `dest` as Parquet, split into files of at most 10 million
# rows each.
#
# existing_data_behavior = "delete_matching" empties `dest` before writing.
# With "overwrite", only files whose names collide are replaced: if an earlier
# run produced more part files (e.g. a different max_rows_per_file or a larger
# input), the surplus files would survive and be picked up as extra rows by
# anything that later opens `dest` as a dataset. `dest` must therefore be a
# directory dedicated to this output.
#
# hive_style only affects how partition directories are named; no partitioning
# is requested here, so it is kept purely for parity with the original call.
write_dataset(
  csv_stream,
  dest,
  format = "parquet",
  max_rows_per_file = 10000000L,
  hive_style = TRUE,
  existing_data_behavior = "delete_matching"
)